echo = FALSE parameter is added to the code chunk to prevent printing of the R code that generated the plot.# code to remove objects in Environment before knitting
rm(list = ls())
books <- read.csv(here::here("datasets", "books.csv"))
books_two <- read.csv(here::here("datasets", "books_two.csv"))
# ---------------------------------------
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
# ---------------------------------------
# ---------------------------------------
# data visualization
# --------------------------------------
library('ggplot2')
library('plotly')
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library('gganimate')
library('ggridges')
# ---------------------------------------
# data manipulation
# --------------------------------------
library('forcats')
library('tidyverse')
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ purrr 0.3.4
## ✓ tidyr 1.1.2 ✓ dplyr 1.0.2
## ✓ readr 1.3.1 ✓ stringr 1.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x readr::spec() masks yardstick::spec()
library('magrittr')
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library('lubridate')
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library('dplyr')
library('DT')
#install.packages("formattable")
#install.packages("tidyr")
library('formattable')
##
## Attaching package: 'formattable'
## The following object is masked from 'package:plotly':
##
## style
library('tidyr')
library('data.table')
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library('kableExtra')
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# ---------------------------------------
# sentiment analysis
# --------------------------------------
library('sentimentr')
##
## Attaching package: 'sentimentr'
## The following object is masked from 'package:plotly':
##
## highlight
# ---------------------------------------
# summary statistics
# --------------------------------------
#install.packages("qwraps2")
library("qwraps2")
# ---------------------------------------
# model validation library
# ---------------------------------------
library('rsample')
# ---------------------------------------
# generalized linear model libraries
# ---------------------------------------
library('glmnet')
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.0-2
library('glmnetUtils')
##
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
##
## cv.glmnet, glmnet
# ---------------------------------------
# regression output
# ---------------------------------------
# install.packages('sjPlot')
library('sjPlot')
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
# install.packages('sjPlot')
library('tidymodels')
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom 0.7.0 ✓ parsnip 0.1.3
## ✓ dials 0.0.9 ✓ recipes 0.1.13
## ✓ infer 0.5.3 ✓ tune 0.1.1
## ✓ modeldata 0.0.2 ✓ workflows 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## x data.table::between() masks dplyr::between()
## x scales::discard() masks purrr::discard()
## x Matrix::expand() masks tidyr::expand()
## x magrittr::extract() masks tidyr::extract()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x data.table::first() masks dplyr::first()
## x recipes::fixed() masks stringr::fixed()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x Matrix::pack() masks tidyr::pack()
## x magrittr::set_names() masks purrr::set_names()
## x readr::spec() masks yardstick::spec()
## x recipes::step() masks stats::step()
## x data.table::transpose() masks purrr::transpose()
## x Matrix::unpack() masks tidyr::unpack()
# ---------------------------------------
# random forest libraries
# ---------------------------------------
library('partykit')
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
#library('tidyverse')
library('PerformanceAnalytics')
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library('rpart')
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library('rpart.plot')
library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
#install.packages("randomForestExplainer")
library('randomForestExplainer')
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# ---------------------------------------
# lasso libraries
# ---------------------------------------
library('broom')
library('coefplot')
##
## Attaching package: 'coefplot'
## The following object is masked from 'package:qwraps2':
##
## invlogit
# ---------------------------------------
books <- books %>% rename(avg_book_rating = average_rating,
book_ratings_count = ratings_count,
author = authors)
books_two <- books_two %>% rename(author = authors,
authorworkcount = workcount,
author_fans = fan_count,
avg_author_rating = average_rate,
author_ratings_count = rating_count,
author_review_count = review_count,
)
Need ‘sentimentr’ library.
sentiment_DF <- get_sentences(books$title) %>% sentiment_by(books$title)
head(sentiment_DF)
## title
## 1:
## 2: said the shotgun to the head.
## 3: $30 Film School: How to Write Direct Produce Shoot Edit Distribute Tour With and Sell Your Own No-Budget Digital Movie
## 4: 'Salem's Lot
## 5: 1 000 Places to See Before You Die
## 6: 10 lb Penalty
## word_count sd ave_sentiment
## 1: 0 0 0.0000000
## 2: 6 NA -0.1632993
## 3: 20 NA -0.1118034
## 4: 16 0 0.0000000
## 5: 6 NA -0.3061862
## 6: 2 NA -0.5303301
books_s <- inner_join(x = books,
y = sentiment_DF,
by = "title")
head(books_s)
## bookID
## 1 1
## 2 2
## 3 4
## 4 5
## 5 8
## 6 9
## title
## 1 Harry Potter and the Half-Blood Prince (Harry Potter #6)
## 2 Harry Potter and the Order of the Phoenix (Harry Potter #5)
## 3 Harry Potter and the Chamber of Secrets (Harry Potter #2)
## 4 Harry Potter and the Prisoner of Azkaban (Harry Potter #3)
## 5 Harry Potter Boxed Set Books 1-5 (Harry Potter #1-5)
## 6 Unauthorized Harry Potter Book Seven News: Half-Blood Prince Analysis and Speculation
## author avg_book_rating isbn isbn13
## 1 J.K. Rowling/Mary GrandPré 4.57 0439785960 9780439785969
## 2 J.K. Rowling/Mary GrandPré 4.49 0439358078 9780439358071
## 3 J.K. Rowling 4.42 0439554896 9780439554893
## 4 J.K. Rowling/Mary GrandPré 4.56 043965548X 9780439655484
## 5 J.K. Rowling/Mary GrandPré 4.78 0439682584 9780439682589
## 6 W. Frederick Zimmerman 3.74 0976540606 9780976540601
## language_code num_pages book_ratings_count text_reviews_count
## 1 eng 652 2095690 27591
## 2 eng 870 2153167 29221
## 3 eng 352 6333 244
## 4 eng 435 2339585 36325
## 5 eng 2690 41428 164
## 6 en-US 152 19 1
## publication_date publisher word_count sd ave_sentiment
## 1 9/16/2006 Scholastic Inc. 18 0 0.2000000
## 2 9/1/2004 Scholastic Inc. 10 NA 0.0000000
## 3 11/1/2003 Scholastic 18 0 0.0000000
## 4 5/1/2004 Scholastic Inc. 18 0 -0.2500000
## 5 9/13/2004 Scholastic 7 NA 0.0000000
## 6 4/26/2005 Nimble Books 12 NA -0.1299038
head(books_two)
## authorid author authorworkcount author_fans gender
## 1 8409092 Jason Wallace 2 13 male
## 2 5796406 Rosan Hollak 4 0 unknown
## 3 8421525 Nanna Foss 6 156 female
## 4 158146 Terri Savelle Foy 23 125 female
## 5 15340731 Vishwas Nangare Patil 1 127 unknown
## 6 7189636 Shweta Punj 2 3 unknown
## image_url
## 1 https://images.gr-assets.com/authors/1489266848p7/8409092.jpg
## 2 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
## 3 https://images.gr-assets.com/authors/1409085874p7/8421525.jpg
## 4 https://images.gr-assets.com/authors/1475694606p7/158146.jpg
## 5 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
## 6 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
## about
## 1 Jason Wallace is related to Tolkien and a descendant of one of the first International English cricketers, and also of the world-renowned Victorian circus owner 'Lord' George Sanger. He was born in Cheltenham in 1969 but moved to London after his parents split up. Aged 12 his life was turned upside down when his mother remarried and the family emigrated to Zimbabwe. It was this experience in a tough boarding school during the aftermath of the war for independence that forms the foundation of his incredible first novel, Out of Shadows. And he did actually meet Robert Mugabe when he visited his school.<br /><br />Jason is currently a web designer, living in South West London with his partner and son.<br />
## 2
## 3 Danish YA writer. <br /><br />I make up stories about time travel, friendship, love, and mysterious adventures. <br /><br />I like to read books that make me cry or laugh or both.<br /><br />Danish Social Media:<br />Instagram: <a target="_blank" href="http://www.instagram.com/nannafoss" rel="nofollow">www.instagram.com/nannafoss</a><br />Facebook: <a target="_blank" href="http://www.facebook.com/nannafoss" rel="nofollow">www.facebook.com/nannafoss</a><br />Blog: <a target="_blank" href="http://www.nannafoss.dk" rel="nofollow">www.nannafoss.dk</a> <br /><br />English Social Media:<br />Tumblr: <a target="_blank" href="http://www.nannafoss.tumblr.com" rel="nofollow">www.nannafoss.tumblr.com</a><br /><br />YouTube playlists for my books:<br /><a target="_blank" href="http://www.youtube.com/channel/UCUL5UmL4QUSPyywz4ULc-gg/playlists" rel="nofollow">www.youtube.com/channel/UCUL5UmL4QUSP...</a>
## 4 For years, Terri Savelle Foy’s life was average. She had no dreams to pursue. Each passing day was just a repeat of the day before. Finally, with a marriage in trouble and her life falling apart, Terri made a change. She began to pursue God like never before, develop a new routine and discovered the power of having a dream and purpose.<br /><br />As Terri started to recognize her own dreams and goals, she simply wrote them down and reviewed them consistently. This written vision became a road map to drive her life. As a result, those dreams are now a reality.<br /><br />Terri has become the CEO of an international Christian ministry. She is an author, a conference speaker, and a success coach to hundreds of thousands of people all over the world. Her best-selling books Make Your Dreams Bigger than Your Memories, and Imagine Big have helped people discover how to overcome the hurts of the past and see the possibilities of a limitless future. Her weekly podcast is a lifeline of hope and inspiration to people around the world.<br /><br />Terri Savelle Foy is a cheerleader of dreams and is convinced that “if you can dream it, God can do it.” She is known across the globe as a world-class motivator of hope and success through her transparent and humorous teaching style. Terri’s unique ability to communicate success strategies in a simple and practical way has awakened the dreams of the young and old alike. <br /><br />Terri shares from personal experience the biblical concepts of using the gift of the imagination to reach full potential in Jesus Christ. From stay-at-home moms to business executives, Terri consistently inspires others to go after their dreams. With step-by-step instruction and the inspiration to follow through, people are fueled with the passion to complete their life assignment down to the last detail (see John 17:4).<br /><br />Terri and her husband, Rodney Foy, have been married since 1991, and are the parents of a beautiful redheaded daughter, Kassidi Cherie. They live near Dallas, Texas. For more information about Terri, go to <a target="_blank" href="http://www.terri.com" rel="nofollow">www.terri.com</a>.<br />
## 5
## 6
## born died influence avg_author_rating author_ratings_count
## 1 3.74 1028
## 2 3.73 15
## 3 4.35 1172
## 4 4.56 1054
## 5 4.15 725
## 6 3.65 231
## author_review_count website twitter
## 1 175
## 2 1
## 3 205 http://www.nannafoss.dk
## 4 151 http://www.terri.com terrisavellefoy
## 5 43
## 6 25
## genre original_hometown
## 1 Cheltenham
## 2
## 3 fantasy,fiction,paranormal
## 4 religion and spirituality,self help,spirituality
## 5
## 6
## country latitude longitude
## 1 United Kingdom 51.90006 -2.07972
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
books_sa <-
inner_join(x = books_s,
y = books_two,
by = "author")
head(books_sa)
## bookID
## 1 4
## 2 10
## 3 12
## 4 13
## 5 14
## 6 18
## title
## 1 Harry Potter and the Chamber of Secrets (Harry Potter #2)
## 2 Harry Potter Collection (Harry Potter #1-6)
## 3 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy #1-5)
## 4 The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1-5)
## 5 The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1)
## 6 The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy #1-5)
## author avg_book_rating isbn isbn13 language_code
## 1 J.K. Rowling 4.42 0439554896 9780439554893 eng
## 2 J.K. Rowling 4.73 0439827604 9780439827607 eng
## 3 Douglas Adams 4.38 0517226952 9780517226957 eng
## 4 Douglas Adams 4.38 0345453743 9780345453747 eng
## 5 Douglas Adams 4.22 1400052920 9781400052929 eng
## 6 Douglas Adams 4.38 0517149257 9780517149256 eng
## num_pages book_ratings_count text_reviews_count publication_date
## 1 352 6333 244 11/1/2003
## 2 3342 28242 808 9/12/2005
## 3 815 3628 254 11/1/2005
## 4 815 249558 4080 4/30/2002
## 5 215 4930 460 8/3/2004
## 6 815 2877 195 1/17/1996
## publisher word_count sd ave_sentiment authorid authorworkcount
## 1 Scholastic 18 0 0.0000000 1077326 242
## 2 Scholastic 5 NA 0.0000000 1077326 242
## 3 Gramercy Books 15 NA 0.3098387 4 103
## 4 Del Rey Books 12 NA 0.3464102 4 103
## 5 Crown 44 0 0.3618136 4 103
## 6 Wings Books 9 NA 0.4000000 4 103
## author_fans gender
## 1 209174 female
## 2 209174 female
## 3 19029 male
## 4 19029 male
## 5 19029 male
## 6 19029 male
## image_url
## 1 https://images.gr-assets.com/authors/1510435123p7/1077326.jpg
## 2 https://images.gr-assets.com/authors/1510435123p7/1077326.jpg
## 3 https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 4 https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 5 https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 6 https://images.gr-assets.com/authors/1189120061p7/4.jpg
## about
## 1 See also: <a href="https://www.goodreads.com/author/show/383606.Robert_Galbraith" title="Robert Galbraith" rel="nofollow">Robert Galbraith</a><br />Although she writes under the pen name <b>J.K. Rowling</b>, pronounced like <i>rolling</i>, her name when her first <i>Harry Potter</i> book was published was simply <b>Joanne Rowling</b>. Anticipating that the target audience of young boys might not want to read a book written by a woman, her publishers demanded that she use two initials, rather than her full name. As she had no middle name, she chose <b>K</b> as the second initial of her pen name, from her paternal grandmother Kathleen Ada Bulgen Rowling. She calls herself <b>Jo</b> and has said, "No one ever called me 'Joanne' when I was young, unless they were angry." Following her marriage, she has sometimes used the name <b>Joanne Murray</b> when conducting personal business. During the Leveson Inquiry she gave evidence under the name of <b>Joanne Kathleen Rowling</b>. In a 2012 interview, Rowling noted that she no longer cared that people pronounced her name incorrectly.<br /><br />Rowling was born to Peter James Rowling, a Rolls-Royce aircraft engineer, and Anne Rowling (née Volant), on 31 July 1965 in Yate, Gloucestershire, England, 10 miles (16 km) northeast of Bristol. Her mother Anne was half-French and half-Scottish. Her parents first met on a train departing from King's Cross Station bound for Arbroath in 1964. They married on 14 March 1965. Her mother's maternal grandfather, Dugald Campbell, was born in Lamlash on the Isle of Arran. Her mother's paternal grandfather, Louis Volant, was awarded the Croix de Guerre for exceptional bravery in defending the village of Courcelles-le-Comte during the First World War.<br /><br />Rowling's sister Dianne was born at their home when Rowling was 23 months old. The family moved to the nearby village Winterbourne when Rowling was four. She attended St Michael's Primary School, a school founded by abolitionist William Wilberforce and education reformer Hannah More. Her headmaster at St Michael's, Alfred Dunn, has been suggested as the inspiration for the <i>Harry Potter</i> headmaster Albus Dumbledore.<br /><br />As a child, Rowling often wrote fantasy stories, which she would usually then read to her sister. She recalls that: "I can still remember me telling her a story in which she fell down a rabbit hole and was fed strawberries by the rabbit family inside it. Certainly the first story I ever wrote down (when I was five or six) was about a rabbit called Rabbit. He got the measles and was visited by his friends, including a giant bee called Miss Bee." At the age of nine, Rowling moved to Church Cottage in the Gloucestershire village of Tutshill, close to Chepstow, Wales. When she was a young teenager, her great aunt, who Rowling said "taught classics and approved of a thirst for knowledge, even of a questionable kind," gave her a very old copy of Jessica Mitford's autobiography, <i>Hons and Rebels</i>. Mitford became Rowling's heroine, and Rowling subsequently read all of her books.<br /><br />Rowling has said of her teenage years, in an interview with The New Yorker, "I wasn’t particularly happy. I think it’s a dreadful time of life." She had a difficult homelife; her mother was ill and she had a difficult relationship with her father (she is no longer on speaking terms with him). She attended secondary school at Wyedean School and College, where her mother had worked as a technician in the science department. Rowling said of her adolescence, "Hermione [a bookish, know-it-all <i>Harry Potter</i> character] is loosely based on me. She's a caricature of me when I was eleven, which I'm not particularly proud of." Steve Eddy, who taught Rowling English when she first arrived, remembers her as "not exceptional" but "one of a group of girls who were bright, and quite good at English." Sean Harris, her best friend in the Upper Sixth owned a turquoise Ford Anglia, which she says inspired the one in her books.
## 2 See also: <a href="https://www.goodreads.com/author/show/383606.Robert_Galbraith" title="Robert Galbraith" rel="nofollow">Robert Galbraith</a><br />Although she writes under the pen name <b>J.K. Rowling</b>, pronounced like <i>rolling</i>, her name when her first <i>Harry Potter</i> book was published was simply <b>Joanne Rowling</b>. Anticipating that the target audience of young boys might not want to read a book written by a woman, her publishers demanded that she use two initials, rather than her full name. As she had no middle name, she chose <b>K</b> as the second initial of her pen name, from her paternal grandmother Kathleen Ada Bulgen Rowling. She calls herself <b>Jo</b> and has said, "No one ever called me 'Joanne' when I was young, unless they were angry." Following her marriage, she has sometimes used the name <b>Joanne Murray</b> when conducting personal business. During the Leveson Inquiry she gave evidence under the name of <b>Joanne Kathleen Rowling</b>. In a 2012 interview, Rowling noted that she no longer cared that people pronounced her name incorrectly.<br /><br />Rowling was born to Peter James Rowling, a Rolls-Royce aircraft engineer, and Anne Rowling (née Volant), on 31 July 1965 in Yate, Gloucestershire, England, 10 miles (16 km) northeast of Bristol. Her mother Anne was half-French and half-Scottish. Her parents first met on a train departing from King's Cross Station bound for Arbroath in 1964. They married on 14 March 1965. Her mother's maternal grandfather, Dugald Campbell, was born in Lamlash on the Isle of Arran. Her mother's paternal grandfather, Louis Volant, was awarded the Croix de Guerre for exceptional bravery in defending the village of Courcelles-le-Comte during the First World War.<br /><br />Rowling's sister Dianne was born at their home when Rowling was 23 months old. The family moved to the nearby village Winterbourne when Rowling was four. She attended St Michael's Primary School, a school founded by abolitionist William Wilberforce and education reformer Hannah More. Her headmaster at St Michael's, Alfred Dunn, has been suggested as the inspiration for the <i>Harry Potter</i> headmaster Albus Dumbledore.<br /><br />As a child, Rowling often wrote fantasy stories, which she would usually then read to her sister. She recalls that: "I can still remember me telling her a story in which she fell down a rabbit hole and was fed strawberries by the rabbit family inside it. Certainly the first story I ever wrote down (when I was five or six) was about a rabbit called Rabbit. He got the measles and was visited by his friends, including a giant bee called Miss Bee." At the age of nine, Rowling moved to Church Cottage in the Gloucestershire village of Tutshill, close to Chepstow, Wales. When she was a young teenager, her great aunt, who Rowling said "taught classics and approved of a thirst for knowledge, even of a questionable kind," gave her a very old copy of Jessica Mitford's autobiography, <i>Hons and Rebels</i>. Mitford became Rowling's heroine, and Rowling subsequently read all of her books.<br /><br />Rowling has said of her teenage years, in an interview with The New Yorker, "I wasn’t particularly happy. I think it’s a dreadful time of life." She had a difficult homelife; her mother was ill and she had a difficult relationship with her father (she is no longer on speaking terms with him). She attended secondary school at Wyedean School and College, where her mother had worked as a technician in the science department. Rowling said of her adolescence, "Hermione [a bookish, know-it-all <i>Harry Potter</i> character] is loosely based on me. She's a caricature of me when I was eleven, which I'm not particularly proud of." Steve Eddy, who taught Rowling English when she first arrived, remembers her as "not exceptional" but "one of a group of girls who were bright, and quite good at English." Sean Harris, her best friend in the Upper Sixth owned a turquoise Ford Anglia, which she says inspired the one in her books.
## 3 Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 4 Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 5 Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 6 Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## born died
## 1 7/31/65
## 2 7/31/65
## 3 3/11/52 5/11/01
## 4 3/11/52 5/11/01
## 5 3/11/52 5/11/01
## 6 3/11/52 5/11/01
## influence
## 1 C.S. Lewis,Oscar Wilde,Geoffrey Chaucer,Jane Austen,
## 2 C.S. Lewis,Oscar Wilde,Geoffrey Chaucer,Jane Austen,
## 3 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 4 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 5 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 6 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## avg_author_rating author_ratings_count author_review_count
## 1 4.46 24511114 579250
## 2 4.46 24511114 579250
## 3 4.20 2624222 57565
## 4 4.20 2624222 57565
## 5 4.20 2624222 57565
## 6 4.20 2624222 57565
## website twitter genre
## 1 http://www.jkrowling.com jk_rowling fantasy,fiction,young adult
## 2 http://www.jkrowling.com jk_rowling fantasy,fiction,young adult
## 3 http://www.douglasadams.com/ comedy,fiction,mystery and thrillers
## 4 http://www.douglasadams.com/ comedy,fiction,mystery and thrillers
## 5 http://www.douglasadams.com/ comedy,fiction,mystery and thrillers
## 6 http://www.douglasadams.com/ comedy,fiction,mystery and thrillers
## original_hometown country latitude longitude
## 1 Yate, South Gloucestershire, England United Kingdom 51.54074 -2.41839
## 2 Yate, South Gloucestershire, England United Kingdom 51.54074 -2.41839
## 3 Cambridge, England United Kingdom 52.20000 0.11667
## 4 Cambridge, England United Kingdom 52.20000 0.11667
## 5 Cambridge, England United Kingdom 52.20000 0.11667
## 6 Cambridge, England United Kingdom 52.20000 0.11667
# mutate to correct column data types
books_1 <- books_sa %>% mutate(num_pages = as.numeric(num_pages),
avg_book_rating = as.numeric(avg_book_rating),
text_reviews_count = as.numeric(text_reviews_count),
publication_date = as.Date(publication_date, format="%m/%d/%Y"),
born = as.Date(born, format="%m/%d/%Y"),
died = as.Date(died, format="%m/%d/%Y"),
gender = as.factor(gender)
)
# remove NAs
books_total <- books_1 %>%
filter(
(!is.na(avg_book_rating)), (!is.na(book_ratings_count)), (!is.na(text_reviews_count)), (!is.na(publication_date)),
(!duplicated(title)),
(avg_book_rating != 0),
(author != "NOT A BOOK"),
(!is_greater_than(num_pages, 2000)),
(num_pages != 0),
(bookID != 9796),
(!is_less_than(num_pages, 10))
)
# remove irrelevant variables (11):
# sd(standard deviation of words in title), author ID, image_URL, about, influence, website, twitter, original hometown, country, latitude, longitude
books_corti <- books_total %>% select(-isbn13,
-sd,
-authorid,
-image_url,
-about,
-influence,
-website,
-twitter,
-original_hometown,
-country,
-latitude,
-longitude) %>% rename(
title_sentiment_avg = ave_sentiment,
title_word_count = word_count
)
# View(books_corti)
# NA VISUALIZATION
# to see the number of missing values in each column
# STEPS:
# 1) We need to sum through every column using a FOR loop.
# 2) Then print the variable name using names(movies[i]).
# 3) Finally, we print the sum of is.na() for just that variable.
# FOR loop to see each column in books data set
for(i in 1:ncol(books_corti)){
# print the following
print(
# first print "Variable: "
paste0("Variable: ",
# then print the variable name, then "NAs: "
names(books_corti)[i], " NAs: ",
# then print the sum of the number of missing values
# for that variable
sum(is.na(books_corti %>% select(i)))
)
)
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(i)` instead of `i` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] "Variable: bookID NAs: 0"
## [1] "Variable: title NAs: 0"
## [1] "Variable: author NAs: 0"
## [1] "Variable: avg_book_rating NAs: 0"
## [1] "Variable: isbn NAs: 0"
## [1] "Variable: language_code NAs: 0"
## [1] "Variable: num_pages NAs: 0"
## [1] "Variable: book_ratings_count NAs: 0"
## [1] "Variable: text_reviews_count NAs: 0"
## [1] "Variable: publication_date NAs: 0"
## [1] "Variable: publisher NAs: 0"
## [1] "Variable: title_word_count NAs: 0"
## [1] "Variable: title_sentiment_avg NAs: 0"
## [1] "Variable: authorworkcount NAs: 0"
## [1] "Variable: author_fans NAs: 0"
## [1] "Variable: gender NAs: 0"
## [1] "Variable: born NAs: 2652"
## [1] "Variable: died NAs: 4210"
## [1] "Variable: avg_author_rating NAs: 0"
## [1] "Variable: author_ratings_count NAs: 0"
## [1] "Variable: author_review_count NAs: 0"
## [1] "Variable: genre NAs: 0"
# starts_with() function for certain columns...
books_corti %>% select(starts_with("isbn")) %>% glimpse()
## Rows: 5,914
## Columns: 1
## $ isbn <chr> "0439554896", "0517226952", "0345453743", "1400052920", "0517149…
# exploring first 10 rows using slice() function
explore_data <- books_corti %>% arrange(desc(avg_book_rating)) %>% slice(1:10) %>% select(title, author, avg_book_rating)
print(explore_data)
## title
## 1 Zone of the Enders: The 2nd Runner Official Strategy Guide
## 2 The Diamond Color Meditation: Color Pathway to the Soul
## 3 Taxation of Mineral Rents
## 4 The Irish Anatomist: A Study of Flann O'Brien
## 5 His Princess Devotional: A Royal Encounter With Your King
## 6 Stargirl LitPlans on CD
## 7 The Complete Calvin and Hobbes
## 8 Wissenschaft der Logik: Die Lehre Vom Begriff (1816)
## 9 It's a Magical World (Calvin and Hobbes #11)
## 10 Homicidal Psycho Jungle Cat (Calvin and Hobbes #9)
## author avg_book_rating
## 1 Tim Bogenn 5.00
## 2 John Diamond 5.00
## 3 Ross Garnaut 5.00
## 4 Keith Donohue 5.00
## 5 Sheri Rose Shepherd 5.00
## 6 Mary B. Collins 4.86
## 7 Bill Watterson 4.82
## 8 Georg Wilhelm Friedrich Hegel 4.78
## 9 Bill Watterson 4.76
## 10 Bill Watterson 4.72
datatable(books_corti)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# ONLY select "NOT A BOOK" under author variable (a.k.a. the column) and store this as a new data frame
not_a_book <- books_corti %>% filter(author == "NOT A BOOK") %>% nrow()
print(not_a_book)
## [1] 0
Linear regression analysis is sensitive to outliers. Use histogram to see where this will occur.
ggplot(books_corti, aes(x = avg_book_rating)) +
xlab("Average Book Rating") +
ylab("Count") +
geom_histogram(fill = "skyblue", color = "#879bcd") +
theme_dark(base_size = 18) +
ggtitle(" Histogram to View Outliers")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p <- books_corti %>%
ggplot(aes(avg_book_rating, title_sentiment_avg)) +
xlab("Average Book Rating") + ylab("Title Sentiment") +
geom_point(color = "skyblue", alpha = 1/2, size = 0.5) +
theme_bw(base_size = 18) +
ggtitle("Exploring the Data: Visualization 1")
ggplotly(p)
p <- ggplot(books_corti %>%
mutate(genderMutated = fct_lump(gender, n = 10)),
aes(x = avg_book_rating, y = genderMutated, fill = genderMutated)) +
theme_minimal(base_size = 18) +
geom_density_ridges(color="black") +
xlab("Average Book Rating") +
ylab("Gender of Author") +
ggtitle(" Exploring the Data: Visualization 2")
p + theme(legend.position = "none")
## Picking joint bandwidth of 0.0532
str(books_corti)
## 'data.frame': 5914 obs. of 22 variables:
## $ bookID : chr "4" "12" "13" "14" ...
## $ title : chr "Harry Potter and the Chamber of Secrets (Harry Potter #2)" "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy #1-5)" "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1-5)" "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1)" ...
## $ author : chr "J.K. Rowling" "Douglas Adams" "Douglas Adams" "Douglas Adams" ...
## $ avg_book_rating : num 4.42 4.38 4.38 4.22 4.38 4.21 3.44 3.87 4.07 3.9 ...
## $ isbn : chr "0439554896" "0517226952" "0345453743" "1400052920" ...
## $ language_code : chr "eng" "eng" "eng" "eng" ...
## $ num_pages : num 352 815 815 215 815 544 55 256 335 304 ...
## $ book_ratings_count : int 6333 3628 249558 4930 2877 248558 7270 2088 72451 49240 ...
## $ text_reviews_count : num 244 254 4080 460 195 ...
## $ publication_date : Date, format: "2003-11-01" "2005-11-01" ...
## $ publisher : chr "Scholastic" "Gramercy Books" "Del Rey Books" "Crown" ...
## $ title_word_count : int 18 15 12 44 9 12 8 12 4 14 ...
## $ title_sentiment_avg : num 0 0.31 0.346 0.362 0.4 ...
## $ authorworkcount : int 242 103 103 103 103 59 59 59 59 59 ...
## $ author_fans : int 209174 19029 19029 19029 19029 14356 14356 14356 14356 14356 ...
## $ gender : Factor w/ 3 levels "female","male",..: 1 2 2 2 2 2 2 2 2 2 ...
## $ born : Date, format: "0065-07-31" "0052-03-11" ...
## $ died : Date, format: NA "0001-05-11" ...
## $ avg_author_rating : num 4.46 4.2 4.2 4.2 4.2 4.03 4.03 4.03 4.03 4.03 ...
## $ author_ratings_count: int 24511114 2624222 2624222 2624222 2624222 1272002 1272002 1272002 1272002 1272002 ...
## $ author_review_count : int 579250 57565 57565 57565 57565 76846 76846 76846 76846 76846 ...
## $ genre : chr "fantasy,fiction,young adult" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" ...
options(qwraps2_markup = "markdown")
view(books_corti)
our_summary1 <-
list("Average Book Rating" =
list("min" = ~ min(avg_book_rating),
"mean" = ~ mean(avg_book_rating),
"max" = ~ max(avg_book_rating),
"st. dev" = ~ sd(avg_book_rating)),
"Number of Pages" =
list("min" = ~ min(num_pages),
"mean" = ~ mean(num_pages),
"max" = ~ max(num_pages),
"st.dev" = ~ sd(num_pages)),
"Book Ratings Count" =
list("min" = ~ min(book_ratings_count),
"mean" = ~ mean(book_ratings_count),
"max" = ~ max(book_ratings_count),
"st. dev" = ~ sd(book_ratings_count)),
"Text Reviews Count" =
list("min" = ~ min(text_reviews_count),
"mean" = ~ mean(text_reviews_count),
"max" = ~ max(text_reviews_count),
"st. dev" = ~ sd(text_reviews_count)),
"Average Title Sentiment Score" =
list("min" = ~ min(title_sentiment_avg),
"mean" = ~ mean(title_sentiment_avg),
"max" = ~ max(title_sentiment_avg),
"st. dev" = ~ sd(title_sentiment_avg)),
"Author's Work Count" =
list("min" = ~ min(authorworkcount),
"mean" = ~ mean(authorworkcount),
"max" = ~ max(authorworkcount),
"st. dev" = ~ sd(authorworkcount)),
"Author's Fan Count" =
list("min" = ~ min(author_fans),
"mean" = ~ mean(author_fans),
"max" = ~ max(author_fans),
"st. dev" = ~ sd(author_fans)),
"Author Ratings Count" =
list("min" = ~ min(author_ratings_count),
"mean" = ~ mean(author_ratings_count),
"max" = ~ max(author_ratings_count),
"st. dev" = ~ sd(author_ratings_count)),
"Author Review Count" =
list("min" = ~ min(author_review_count),
"mean" = ~ mean(author_review_count),
"max" = ~ max(author_review_count),
"st. dev" = ~ sd(author_review_count))
)
sum_stats <- summary_table(books_corti, our_summary1) %>% round(1)
print(sum_stats)
##
##
## | |books_corti (N = 5,914) |
## |:---------------------------------|:-----------------------|
## |**Average Book Rating** | |
## | min |1 |
## | mean |3.9 |
## | max |5 |
## | st. dev |0.3 |
## |**Number of Pages** | |
## | min |10 |
## | mean |350.4 |
## | max |1952 |
## | st.dev |195.2 |
## |**Book Ratings Count** | |
## | min |0 |
## | mean |21480.8 |
## | max |4597666 |
## | st. dev |120423.5 |
## |**Text Reviews Count** | |
## | min |0 |
## | mean |696.7 |
## | max |94265 |
## | st. dev |2844.8 |
## |**Average Title Sentiment Score** | |
## | min |-1.4 |
## | mean |0 |
## | max |1.3 |
## | st. dev |0.3 |
## |**Author's Work Count** | |
## | min |1 |
## | mean |231 |
## | max |5204 |
## | st. dev |488.1 |
## |**Author's Fan Count** | |
## | min |0 |
## | mean |12050 |
## | max |709826 |
## | st. dev |55558.6 |
## |**Author Ratings Count** | |
## | min |27 |
## | mean |658708 |
## | max |24511114 |
## | st. dev |1727055.4 |
## |**Author Review Count** | |
## | min |1 |
## | mean |26511.1 |
## | max |579250 |
## | st. dev |58813.9 |
Need to load ‘rsample’ library here.
set.seed(1818)
train_prop <- 0.8
books_split <- initial_split(books_corti, prop = train_prop)
books_train <- training(books_split)
books_test <- testing(books_split)
nrow(books_train)
## [1] 4732
nrow(books_test)
## [1] 1182
head(books_train)
## bookID
## 1 4
## 2 12
## 3 13
## 4 14
## 5 18
## 6 21
## title
## 1 Harry Potter and the Chamber of Secrets (Harry Potter #2)
## 2 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy #1-5)
## 3 The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1-5)
## 4 The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1)
## 5 The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy #1-5)
## 6 A Short History of Nearly Everything
## author avg_book_rating isbn language_code num_pages
## 1 J.K. Rowling 4.42 0439554896 eng 352
## 2 Douglas Adams 4.38 0517226952 eng 815
## 3 Douglas Adams 4.38 0345453743 eng 815
## 4 Douglas Adams 4.22 1400052920 eng 215
## 5 Douglas Adams 4.38 0517149257 eng 815
## 6 Bill Bryson 4.21 076790818X eng 544
## book_ratings_count text_reviews_count publication_date publisher
## 1 6333 244 2003-11-01 Scholastic
## 2 3628 254 2005-11-01 Gramercy Books
## 3 249558 4080 2002-04-30 Del Rey Books
## 4 4930 460 2004-08-03 Crown
## 5 2877 195 1996-01-17 Wings Books
## 6 248558 9396 2004-09-14 Broadway Books
## title_word_count title_sentiment_avg authorworkcount author_fans gender
## 1 18 0.0000000 242 209174 female
## 2 15 0.3098387 103 19029 male
## 3 12 0.3464102 103 19029 male
## 4 44 0.3618136 103 19029 male
## 5 9 0.4000000 103 19029 male
## 6 12 0.0000000 59 14356 male
## born died avg_author_rating author_ratings_count
## 1 0065-07-31 <NA> 4.46 24511114
## 2 0052-03-11 0001-05-11 4.20 2624222
## 3 0052-03-11 0001-05-11 4.20 2624222
## 4 0052-03-11 0001-05-11 4.20 2624222
## 5 0052-03-11 0001-05-11 4.20 2624222
## 6 0051-12-08 <NA> 4.03 1272002
## author_review_count genre
## 1 579250 fantasy,fiction,young adult
## 2 57565 comedy,fiction,mystery and thrillers
## 3 57565 comedy,fiction,mystery and thrillers
## 4 57565 comedy,fiction,mystery and thrillers
## 5 57565 comedy,fiction,mystery and thrillers
## 6 76846 non fiction,travel
Need ‘dplyr’, ‘glmnet’, and ‘glmnetUtils’ libraries here.
options(scipen = 999)
mod1 <- lm(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender, data = books_train)
summary(mod1)
##
## Call:
## lm(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8325 -0.1575 0.0139 0.1787 1.1391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.79675179461 0.01058977825 358.530 < 0.0000000000000002
## num_pages 0.00027651722 0.00002059907 13.424 < 0.0000000000000002
## book_ratings_count -0.00000005966 0.00000007126 -0.837 0.402464
## text_reviews_count 0.00000817665 0.00000320658 2.550 0.010805
## title_sentiment_avg 0.04097254653 0.01483542542 2.762 0.005771
## authorworkcount 0.00003276625 0.00000871417 3.760 0.000172
## author_fans 0.00000013706 0.00000013538 1.012 0.311392
## author_ratings_count 0.00000006167 0.00000000699 8.823 < 0.0000000000000002
## author_review_count -0.00000172266 0.00000025130 -6.855 0.00000000000804
## gendermale 0.03361376024 0.00913831936 3.678 0.000237
## genderunknown 0.01812663016 0.01336993658 1.356 0.175236
##
## (Intercept) ***
## num_pages ***
## book_ratings_count
## text_reviews_count *
## title_sentiment_avg **
## authorworkcount ***
## author_fans
## author_ratings_count ***
## author_review_count ***
## gendermale ***
## genderunknown
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.275 on 4721 degrees of freedom
## Multiple R-squared: 0.06875, Adjusted R-squared: 0.06678
## F-statistic: 34.85 on 10 and 4721 DF, p-value: < 0.00000000000000022
#——————————————————– # estimating “prettier” regression output #——————————————————–
Need ‘sjPlot’ and ‘tidymodels’ libraries.
#——————————————————– tab_model() outputs a table of results #——————————————————–
tab_model(mod1, digits = 3)
| avg book rating | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 3.797 | 3.776 – 3.818 | <0.001 |
| num_pages | 0.000 | 0.000 – 0.000 | <0.001 |
| book_ratings_count | -0.000 | -0.000 – 0.000 | 0.402 |
| text_reviews_count | 0.000 | 0.000 – 0.000 | 0.011 |
| title_sentiment_avg | 0.041 | 0.012 – 0.070 | 0.006 |
| authorworkcount | 0.000 | 0.000 – 0.000 | <0.001 |
| author_fans | 0.000 | -0.000 – 0.000 | 0.311 |
| author_ratings_count | 0.000 | 0.000 – 0.000 | <0.001 |
| author_review_count | -0.000 | -0.000 – -0.000 | <0.001 |
| gender [male] | 0.034 | 0.016 – 0.052 | <0.001 |
| gender [unknown] | 0.018 | -0.008 – 0.044 | 0.175 |
| Observations | 4732 | ||
| R2 / R2 adjusted | 0.069 / 0.067 | ||
#——————————————————– plot_model() outputs a plot of regression coefficients #——————————————————–
plot_model(mod1)+ ylim(-0.1,0.1) + ggtitle(" Average Book Rating Coefficients") + theme_minimal(base_size = 16)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
#——————————————————– tidy() outputs a table of coefficients and their p-values, t-stats #——————————————————–
tidy(mod1)
## # A tibble: 11 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.80 0.0106 359. 0.
## 2 num_pages 0.000277 0.0000206 13.4 2.39e-40
## 3 book_ratings_count -0.0000000597 0.0000000713 -0.837 4.02e- 1
## 4 text_reviews_count 0.00000818 0.00000321 2.55 1.08e- 2
## 5 title_sentiment_avg 0.0410 0.0148 2.76 5.77e- 3
## 6 authorworkcount 0.0000328 0.00000871 3.76 1.72e- 4
## 7 author_fans 0.000000137 0.000000135 1.01 3.11e- 1
## 8 author_ratings_count 0.0000000617 0.00000000699 8.82 1.55e-18
## 9 author_review_count -0.00000172 0.000000251 -6.86 8.04e-12
## 10 gendermale 0.0336 0.00914 3.68 2.37e- 4
## 11 genderunknown 0.0181 0.0134 1.36 1.75e- 1
Note: We used an alpha sequence from 0 to 1 in steps of 0.1.
enet_mod <- cva.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
data = books_train,
alpha = seq(0,1, by = 0.1))
print(enet_mod)
## Call:
## cva.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train, alpha = seq(0, 1, by = 0.1))
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
## Number of crossvalidation folds for lambda: 10
plot(enet_mod)
minlossplot(enet_mod,
cv.type = "min")
# Use this function to find the best alpha.
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
# Get all parameters.
get_model_params <- function(fit) {
alpha <- fit$alpha
lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
best <- which.min(error)
data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
lambdaSE = lambdaSE[best], error = error[best])
}
# Extract the best alpha value & model parameters.
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 1
get_model_params(enet_mod)
## alpha lambdaMin lambdaSE error
## 1 1 0.0001936006 0.01273763 0.07581943
# Extract the best model object.
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
enet_best_mod <- cv.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
data = books_train,
alpha = 0.1)
summary(enet_best_mod)
## Length Class Mode
## lambda 92 -none- numeric
## cvm 92 -none- numeric
## cvsd 92 -none- numeric
## cvup 92 -none- numeric
## cvlo 92 -none- numeric
## nzero 92 -none- numeric
## call 4 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## terms 2 -none- call
## xlev 9 -none- list
## alpha 1 -none- numeric
## nfolds 1 -none- numeric
## sparse 1 -none- logical
## use.model.frame 1 -none- logical
## na.action 1 -none- character
print(enet_best_mod)
## Call:
## cv.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count +
## text_reviews_count + title_sentiment_avg + authorworkcount +
## author_fans + author_ratings_count + author_review_count +
## gender, data = books_train, alpha = 0.1)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Number of crossvalidation folds: 10
## Alpha: 0.1
## Deviance-minimizing lambda: 0.0001430848 (+1 SE): 0.3229453
Print the model’s two suggested values of lambda.
print(enet_best_mod$lambda.min)
## [1] 0.0001430848
print(enet_best_mod$lambda.1se)
## [1] 0.3229453
Plot how the MSE varies as we vary lambda.
plot(enet_best_mod)
coefpath(enet_best_mod)
Compare lambda min & lambda 1SE…
# put into coefficient vector
enet_coefs <- data.frame(
`lasso_min` = coef(enet_best_mod, s = enet_best_mod$lambda.min) %>%
as.matrix() %>% data.frame() %>% round(3),
`lasso_1se` = coef(enet_best_mod, s = enet_best_mod$lambda.1se) %>%
as.matrix() %>% data.frame() %>% round(3)
) %>% rename(`lasso_min` = 1, `lasso_1se` = 2)
print(enet_coefs)
## lasso_min lasso_1se
## (Intercept) 3.815 3.903
## num_pages 0.000 0.000
## book_ratings_count 0.000 0.000
## text_reviews_count 0.000 0.000
## title_sentiment_avg 0.041 0.000
## authorworkcount 0.000 0.000
## author_fans 0.000 0.000
## author_ratings_count 0.000 0.000
## author_review_count 0.000 0.000
## genderfemale -0.018 0.000
## gendermale 0.015 0.000
## genderunknown 0.000 0.000
enet_coefs %>% kable() %>% kable_styling()
| lasso_min | lasso_1se | |
|---|---|---|
| (Intercept) | 3.815 | 3.903 |
| num_pages | 0.000 | 0.000 |
| book_ratings_count | 0.000 | 0.000 |
| text_reviews_count | 0.000 | 0.000 |
| title_sentiment_avg | 0.041 | 0.000 |
| authorworkcount | 0.000 | 0.000 |
| author_fans | 0.000 | 0.000 |
| author_ratings_count | 0.000 | 0.000 |
| author_review_count | 0.000 | 0.000 |
| genderfemale | -0.018 | 0.000 |
| gendermale | 0.015 | 0.000 |
| genderunknown | 0.000 | 0.000 |
Need ‘partykit’, ‘PerformanceAnalytics’, ‘rpart’, ‘rpart.plot’, and ‘randomForest’ libraries.
options(scipen = 10)
#set.seed(1818)
# store row names as columns
books_boot_preds <- books_corti %>% rownames_to_column() %>%
mutate(rowname = as.numeric(rowname))
B <- 100 # number of bootstrap samples
num_b <- 500 # sample size of each bootstrap
boot_mods <- list() # store our bagging models
for(i in 1:B){
boot_idx <- sample(1:nrow(books_corti),
size = num_b,
replace = FALSE)
# fit a tree on each bootstrap sample
boot_tree <- ctree(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count+ gender,
data = books_corti %>%
slice(boot_idx))
# store bootstraped model
boot_mods[[i]] <- boot_tree
# generate predictions for that bootstrap model
preds_boot <- data.frame(
preds_boot = predict(boot_tree),
rowname = boot_idx
)
# rename prediction to indicate which boot iteration it came from
names(preds_boot)[1] <- paste("preds_boot",i,sep = "")
# merge predictions to dataset
books_boot_preds <- left_join(x = books_boot_preds, y = preds_boot,
by = "rowname")
}
#——————————————————– plot() examines an individual model from bagging #——————————————————–
plot(boot_mods[[1]], gp = gpar(fontsize = 8))
books_boot_preds <- books_boot_preds %>%
mutate(preds_bag =
select(., preds_boot1:preds_boot100) %>%
rowMeans(na.rm = TRUE))
# NOTE: At this point in the code, the model has been bootstrapped.
rf_fit <- randomForest(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
data = books_corti,
type = regression,
mtry = 11/3,
ntree = 200,
importance = TRUE)
print(rf_fit)
##
## Call:
## randomForest(formula = avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender, data = books_corti, type = regression, mtry = 11/3, ntree = 200, importance = TRUE)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.06013366
## % Var explained: 24.51
plot(rf_fit)
varImpPlot(rf_fit, type = 1)
plot_min_depth_distribution(rf_fit)
plot_predict_interaction(rf_fit, books_corti, "author_ratings_count", "num_pages")
plot_predict_interaction(rf_fit, books_corti, "authorworkcount", "num_pages")
plot_predict_interaction(rf_fit, books_corti, "num_pages", "title_sentiment_avg")
Storing predictions data frames for Linear and ElasticNet models…
lm_preds_train <- predict(mod1, newdata = books_train)
lm_preds_test <- predict(mod1,
newdata = books_test)
enet_preds_train <- predict(enet_best_mod,
newdata = books_train, s = "lambda.min")
enet_preds_test <- predict(enet_best_mod,
newdata = books_test, s = "lambda.min")
head(lm_preds_train)
## 1 2 3 4 5 6
## 4.446006 4.138931 4.157041 3.976757 4.142188 3.992752
head(lm_preds_test)
## 10 20 21 38 39 43
## 3.874056 3.856204 3.880741 3.859920 3.838027 3.983870
head(enet_preds_train)
## 1
## 1 4.439726
## 2 4.137918
## 3 4.156343
## 4 3.975785
## 5 4.141186
## 6 3.993360
head(enet_preds_test)
## 1
## 10 3.875319
## 20 3.856850
## 21 3.880675
## 38 3.859698
## 39 3.837804
## 43 3.983638
Storing results data frames for Linear and ElasticNet models…
training_predictions <- data.frame(lm_preds_train, enet_preds_train)
results_train <- data.frame(books_train, training_predictions) %>% rename(enet_training = X1)
head(results_train)
## bookID
## 1 4
## 2 12
## 3 13
## 4 14
## 5 18
## 6 21
## title
## 1 Harry Potter and the Chamber of Secrets (Harry Potter #2)
## 2 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy #1-5)
## 3 The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1-5)
## 4 The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy #1)
## 5 The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy #1-5)
## 6 A Short History of Nearly Everything
## author avg_book_rating isbn language_code num_pages
## 1 J.K. Rowling 4.42 0439554896 eng 352
## 2 Douglas Adams 4.38 0517226952 eng 815
## 3 Douglas Adams 4.38 0345453743 eng 815
## 4 Douglas Adams 4.22 1400052920 eng 215
## 5 Douglas Adams 4.38 0517149257 eng 815
## 6 Bill Bryson 4.21 076790818X eng 544
## book_ratings_count text_reviews_count publication_date publisher
## 1 6333 244 2003-11-01 Scholastic
## 2 3628 254 2005-11-01 Gramercy Books
## 3 249558 4080 2002-04-30 Del Rey Books
## 4 4930 460 2004-08-03 Crown
## 5 2877 195 1996-01-17 Wings Books
## 6 248558 9396 2004-09-14 Broadway Books
## title_word_count title_sentiment_avg authorworkcount author_fans gender
## 1 18 0.0000000 242 209174 female
## 2 15 0.3098387 103 19029 male
## 3 12 0.3464102 103 19029 male
## 4 44 0.3618136 103 19029 male
## 5 9 0.4000000 103 19029 male
## 6 12 0.0000000 59 14356 male
## born died avg_author_rating author_ratings_count
## 1 0065-07-31 <NA> 4.46 24511114
## 2 0052-03-11 0001-05-11 4.20 2624222
## 3 0052-03-11 0001-05-11 4.20 2624222
## 4 0052-03-11 0001-05-11 4.20 2624222
## 5 0052-03-11 0001-05-11 4.20 2624222
## 6 0051-12-08 <NA> 4.03 1272002
## author_review_count genre lm_preds_train
## 1 579250 fantasy,fiction,young adult 4.446006
## 2 57565 comedy,fiction,mystery and thrillers 4.138931
## 3 57565 comedy,fiction,mystery and thrillers 4.157041
## 4 57565 comedy,fiction,mystery and thrillers 3.976757
## 5 57565 comedy,fiction,mystery and thrillers 4.142188
## 6 76846 non fiction,travel 3.992752
## enet_training
## 1 4.439726
## 2 4.137918
## 3 4.156343
## 4 3.975785
## 5 4.141186
## 6 3.993360
testing_predictions <- data.frame(
"lm_testing" = lm_preds_test,
"enet_testing" = enet_preds_test)
results_test <- data.frame(books_test, testing_predictions) %>% rename(enet_testing = X1)
head(results_test)
## bookID
## 10 25
## 20 53
## 21 55
## 38 86
## 39 89
## 43 105
## title
## 10 I'm a Stranger Here Myself: Notes on Returning to America After Twenty Years Away
## 20 Guts: The True Stories behind Hatchet and the Brian Books
## 21 Hatchet Jobs: Writings on Contemporary Fiction
## 38 The Heidi Chronicles: Uncommon Women and Others & Isn't It Romantic
## 39 Active Literacy Across the Curriculum: Strategies for Reading Writing Speaking and Listening
## 43 Chapterhouse: Dune (Dune Chronicles #6)
## author avg_book_rating isbn language_code num_pages
## 10 Bill Bryson 3.90 076790382X eng 304
## 20 Gary Paulsen 3.88 0385326505 eng 144
## 21 Dale Peck 3.45 1595580271 en-US 228
## 38 Wendy Wasserstein 3.84 0679734996 eng 249
## 39 Heidi Hayes Jacobs 3.94 1596670231 eng 138
## 43 Frank Herbert 3.91 0441102670 eng 436
## book_ratings_count text_reviews_count publication_date publisher
## 10 49240 2211 2000-06-28 Broadway Books
## 20 2067 334 2001-01-23 Delacorte Press
## 21 99 16 2005-11-01 The New Press
## 38 2766 64 1991-07-02 Vintage
## 39 31 1 2006-03-29 Routledge
## 43 38778 568 1987-07-01 Ace Books
## title_word_count title_sentiment_avg authorworkcount author_fans gender
## 10 14 -0.13363062 59 14356 male
## 20 10 0.07905694 224 2321 male
## 21 6 0.06123724 26 91 unknown
## 38 10 -0.15811388 35 55 female
## 39 12 0.02886751 61 8 female
## 43 4 0.00000000 308 7613 male
## born died avg_author_rating author_ratings_count
## 10 0051-12-08 <NA> 4.03 1272002
## 20 0039-05-17 <NA> 3.79 461181
## 21 0067-06-13 <NA> 3.67 6261
## 38 0050-10-18 0006-01-30 3.63 8656
## 39 0048-10-08 <NA> 3.76 801
## 43 0020-10-08 0086-02-11 4.10 1190679
## author_review_count genre lm_testing
## 10 76846 non fiction,travel 3.874056
## 20 32464 fiction,literature,nature,outdoors 3.856204
## 21 620 sex,young adult 3.880741
## 38 727 fiction,literature 3.859920
## 39 71 non fiction 3.838027
## 43 31318 fantasy,fiction 3.983870
## enet_testing
## 10 3.875319
## 20 3.856850
## 21 3.880675
## 38 3.859698
## 39 3.837804
## 43 3.983638
ggplot(results_train, aes(x = avg_book_rating, y = lm_preds_train)) +
geom_point(alpha = 1/10, size = 4) +
theme_minimal(base_size = 16)+
geom_abline(color = "turquoise")+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Linear Regression: Training True vs Predicted")
ggplot(results_train, aes(x = avg_book_rating, y = enet_preds_train)) +
geom_point(alpha = 1/10, size = 4) +
theme_minimal(base_size = 16)+
geom_abline(color = "turquoise")+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Best ElasticNet: Training True vs Predicted")
ggplot(results_test, aes(x = avg_book_rating, y = lm_preds_test)) +
geom_point(alpha = 1/10, size = 4) +
geom_abline(color = "coral")+
theme_minimal(base_size = 16)+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Linear Regression: Testing True vs Predicted")
ggplot(results_test, aes(x = avg_book_rating, y = enet_preds_test)) +
geom_point(alpha = 1/10, size = 4) +
geom_abline(color = "coral")+
theme_minimal(base_size = 16)+
xlab("True Average Ratings")+
ylab("Predicted Average Ratings")+
xlim(0, 5) + ylim(0, 5)+
ggtitle(" Best ElasticNet: Testing True vs Predicted")
#——————————————————– # 21) MODEL EVALUATION #——————————————————–
rmse(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.275
mae(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 0.209
rsq(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0687
lm_rmse <- rmse(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_mae <- mae(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_rsq <- rsq(books_test, truth = avg_book_rating, estimate = lm_preds_test)
rmse(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.275
mae(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 0.209
rsq(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0687
enet_rmse <- rmse(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_mae <- mae(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_rsq <- rsq(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
Tree OUT-OF-BAG Predictions…
books_right_join <- right_join(books_corti, books_boot_preds)
## Joining, by = c("bookID", "title", "author", "avg_book_rating", "isbn", "language_code", "num_pages", "book_ratings_count", "text_reviews_count", "publication_date", "publisher", "title_word_count", "title_sentiment_avg", "authorworkcount", "author_fans", "gender", "born", "died", "avg_author_rating", "author_ratings_count", "author_review_count", "genre")
books_right_join <- books_right_join %>% ungroup()
tree_rmse <- rmse(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_mae <- mae(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_rsq <- rsq(books_right_join, truth = avg_book_rating, estimate = preds_bag)
Random Forest OUT-OF-BAG Predictions…
preds_OOB <- predict(rf_fit)
rf_rsq <- rsq(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_rmse <- rmse(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_mae <- mae(books_corti, truth = avg_book_rating, estimate = preds_OOB)
All testing data predictions…
rsq_DF <- merge(rf_rsq, enet_rsq, by=c(".metric", ".estimator"))
rsq_DF1 <- merge(rsq_DF, lm_rsq, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
rsq_DF2 <- merge(rsq_DF1, tree_rsq, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(rsq_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 rsq 0.24535 0.06579101 0.06588606 0.1084553
rmse_DF <- merge(rf_rmse, enet_rmse, by=c(".metric", ".estimator"))
rmse_DF1 <- merge(rmse_DF, lm_rmse, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
rmse_DF2 <- merge(rmse_DF1, tree_rmse, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(rmse_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 rmse 0.2452216 0.2633069 0.2633021 0.2692242
mae_DF <- merge(rf_mae, enet_mae, by=c(".metric", ".estimator"))
mae_DF1 <- merge(mae_DF, lm_mae, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate)
mae_DF2 <- merge(mae_DF1, tree_mae, by=c(".metric", ".estimator")) %>% select(-.estimator)
print(mae_DF2)
## .metric Random Forest ElasticNet Linear .estimate
## 1 mae 0.1795154 0.2016244 0.2016341 0.2051364
total <- rbind(rsq_DF2, rmse_DF2)
final <-rbind(total, mae_DF2) %>% rename("Tree" = .estimate, "Metrics" = .metric)
print(final)
## Metrics Random Forest ElasticNet Linear Tree
## 1 rsq 0.2453500 0.06579101 0.06588606 0.1084553
## 2 rmse 0.2452216 0.26330694 0.26330206 0.2692242
## 3 mae 0.1795154 0.20162440 0.20163410 0.2051364
Credit for the code below: https://rfortherestofus.com/2019/11/how-to-make-beautiful-tables-in-r/
Need to load ‘kableExtra’ library.
final %>% kable() %>% kable_styling()
| Metrics | Random Forest | ElasticNet | Linear | Tree |
|---|---|---|---|---|
| rsq | 0.2453500 | 0.0657910 | 0.0658861 | 0.1084553 |
| rmse | 0.2452216 | 0.2633069 | 0.2633021 | 0.2692242 |
| mae | 0.1795154 | 0.2016244 | 0.2016341 | 0.2051364 |
Credit for code below: https://www.littlemissdata.com/blog/prettytables
Need to load ‘formattable’, ‘tidyr’, and ‘data.table’ libraries.
custom_one = "#CCCCFF"
custom_two = "skyblue"
custom_three = "#4ec5a5"
custom_coral = "#FA7268"
# custom_green = "#00AD43"
formattable(final,
align =c("l","c","c","c","c", "c", "c", "c", "r"),
list(`Metrics` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold")),
`Random Forest`= color_tile(custom_one, custom_one),
`ElasticNet`= color_tile(custom_two, custom_two),
`Linear`= color_tile(custom_three, custom_three),
`Tree`= color_tile(custom_coral, custom_coral)
))
| Metrics | Random Forest | ElasticNet | Linear | Tree |
|---|---|---|---|---|
| rsq | 0.2453500 | 0.06579101 | 0.06588606 | 0.1084553 |
| rmse | 0.2452216 | 0.26330694 | 0.26330206 | 0.2692242 |
| mae | 0.1795154 | 0.20162440 | 0.20163410 | 0.2051364 |